# ==============================================================================
# file name: 00-queries-traces.sql
# authors: Bernhard Clemm 
# description: queries the trace data; you do not need this for reproduction 
#              (and cannot use it because it requires access to data base)
# ==============================================================================

-----------------------------------------------------------------
-- this creates a table with timestamps in UTC
-----------------------------------------------------------------

create table people_morenonews_timestamps as
select country, person_id, timezone, w2_utc, w3_utc,
-- create time stamp of one month before start of pre-wave
w2_utc - interval '1' month w2_utc_month_before, 
w3_utc - interval '1' month w3_utc_month_before,
no_news_utc, more_news_utc
from
(select country, person_id, timezone,
case
when country = 'US' then cast(concat(cast(w2 as varchar), ' America/Denver') as timestamp)
when country = 'PL' then cast(concat(cast(w2 as varchar), ' Europe/Warsaw') as timestamp) 
end as w2_utc,
case
when country = 'US' then cast(concat(cast(w3 as varchar), ' America/Denver') as timestamp)
when country = 'PL' then cast(concat(cast(w3 as varchar), ' Europe/Warsaw') as timestamp) 
end as w3_utc,
no_news,
cast(concat(cast(no_news as varchar), ' America/Denver') as timestamp) no_news_utc, 
cast(concat(cast(more_news as varchar), ' Europe/Warsaw') as timestamp) more_news_utc
from person_survey_timestamp
-- only include US subjects who started W3 and PL subjects who started W2
where (w2 is not null and country = 'PL') OR (w3 is not null and country = 'US'))

--------------------------------------------------------------------
-- this creates two domain tables with median values for platforms, 
-- to have fewer missings on prior news ideo
--------------------------------------------------------------------

create table domains_news_us_morenonews as
select source, score_domain,
case 
when score_domain = 'bing.com/news' then  (select approx_percentile(wilson, 0.5) from domains_news_us)
when score_domain = 'aol.com/news' then  (select approx_percentile(wilson, 0.5) from domains_news_us)
else wilson
end wilson_with_platforms
from domains_news_us

create table domains_news_pl_morenonews as
select 
domain, 
case 
when domain = 'onet.pl' then  (select approx_percentile(ideology_estimate, 0.5) from domains_news_pl)
when domain = 'o2.pl' then  (select approx_percentile(ideology_estimate, 0.5) from domains_news_pl)
else ideology_estimate
end ideo_with_platforms
from domains_news_pl

--------------------------------------------------------------------
-- this summarizes the data on person level, by the four periods
--------------------------------------------------------------------
-- - month before w2 (only relevant for PL)
-- - during more news (only relevant for PL)
-- - month before w3 (only relevant for US)
-- - during no news (only relevant for US)

-- not that this also includes US W3 people and PL W2 people who did not participate in experiments (just for reference)

CREATE TABLE people_morenonews_totals AS
SELECT country, person_id, timespan, 
count(*) u_visits, -- counts visits during period
count(distinct(created_local_date)) active_days, -- counts active days during period
SUM(case when news_id is not null then 1 else 0 end) news_visits, -- counts news visits during period
AVG(ideo) news_ideo, -- counts news ideology during period
AVG(ideo_alt) news_ideo_alt,
SUM(political) pol_visits,
SUM(case when news_id is not null and political = 1 then 1 else 0 end) pol_news_visits
from
(select country, person_id, id, news_id, score_domain, ideo, ideo_alt, political, created_local, date(created_local) created_local_date,
w2_month_before_local, w2_local, w3_month_before_local, w3_local, no_news_local, more_news_local,
case -- this defined the period
when created_local > w2_month_before_local and created_local < w2_local then 'month_before_w2'
when created_local > w3_month_before_local and created_local < w3_local then 'month_before_w3'
when created_local > w3_local and created_local < no_news_local then 'during_no_news'
when created_local > w2_local and created_local < more_news_local then 'during_more_news'
end as timespan
from
(select v.*, 
case -- since some a domain might be same in PL/US, but have different ideo score, this assigns the right score:
when v.country = 'US' then d_us.wilson
when v.country = 'PL' then d_pl.ideology_estimate
end ideo,
case -- alternative ideology score
when v.country = 'US' then d_us_alt.wilson_with_platforms
when v.country = 'PL' then d_pl_alt.ideo_with_platforms
end ideo_alt
from
(select v.id, v.person_id, v.country, 
case
when v.country = 'US' then n_us.id
when v.country = 'PL' then n_pl.id
end as news_id,
case
when v.country = 'US' then n_us.score_domain
when v.country = 'PL' then n_pl.score_domain
end as score_domain,
pc.political,
-- this turns all timestamps into local time (defined by "timezone"), so that active days are defined in terms of actual days
at_timezone(t.w2_utc_month_before, t.timezone) w2_month_before_local,
at_timezone(t.w2_utc, t.timezone) w2_local,
at_timezone(t.w3_utc_month_before, t.timezone) w3_month_before_local,
at_timezone(t.w3_utc, t.timezone) w3_local,
at_timezone(t.no_news_utc, t.timezone) no_news_local,
at_timezone(t.more_news_utc, t.timezone) more_news_local,
at_timezone(v.created_utc, t.timezone) created_local
from visits_u_clean_url v
-- this joins the timestamps
LEFT JOIN people_morenonews_timestamps t ON v.person_id = t.person_id
-- this joins the news visits
LEFT JOIN visits_news_all_pl n_pl ON v.id = n_pl.id 
LEFT JOIN visits_news_all_us n_us ON v.id = n_us.id 
-- this joins political classifier
LEFT JOIN titles_all a on v.title = a.title
LEFT JOIN titles_pol_class pc on a.min_id = pc.min_id
where t.person_id is not null) v
-- this joins the ideo scores per domain
LEFT JOIN domains_news_pl d_pl ON v.score_domain = d_pl.domain
LEFT JOIN domains_news_us d_us ON v.score_domain = d_us.score_domain
-- alternative ideo values
LEFT JOIN domains_news_pl_morenonews d_pl_alt ON v.score_domain = d_pl_alt.domain
LEFT JOIN domains_news_us_morenonews d_us_alt ON v.score_domain = d_us_alt.score_domain))
-- this summarizes the data
group by person_id, country, timespan
order by person_id, timespan


--------------------------------------------------------------------
-- domain visits descriptives
--------------------------------------------------------------------

select country, score_domain, ideo, count(*) count from
(select country, person_id, id, news_id, score_domain, ideo, ideo_alt, created_local, date(created_local) created_local_date,
w2_month_before_local, w2_local, w3_month_before_local, w3_local, no_news_local, more_news_local,
case -- this defined the period
when created_local > w2_month_before_local and created_local < w2_local then 'month_before_w2'
when created_local > w3_month_before_local and created_local < w3_local then 'month_before_w3'
when created_local > w3_local and created_local < no_news_local then 'during_no_news'
when created_local > w2_local and created_local < more_news_local then 'during_more_news'
end as timespan
from
(select v.*, 
case -- since some a domain might be same in PL/US, but have different ideo score, this assigns the right score:
when v.country = 'US' then d_us.wilson
when v.country = 'PL' then d_pl.ideology_estimate
end ideo,
case -- alternative ideology score
when v.country = 'US' then d_us_alt.wilson_with_platforms
when v.country = 'PL' then d_pl_alt.ideo_with_platforms
end ideo_alt
from
(select v.id, v.person_id, v.country, 
case
when v.country = 'US' then n_us.id
when v.country = 'PL' then n_pl.id
end as news_id,
case
when v.country = 'US' then n_us.score_domain
when v.country = 'PL' then n_pl.score_domain
end as score_domain,
-- this turns all timestamps into local time (defined by "timezone"), so that active days are defined in terms of actual days
at_timezone(t.w2_utc_month_before, t.timezone) w2_month_before_local,
at_timezone(t.w2_utc, t.timezone) w2_local,
at_timezone(t.w3_utc_month_before, t.timezone) w3_month_before_local,
at_timezone(t.w3_utc, t.timezone) w3_local,
at_timezone(t.no_news_utc, t.timezone) no_news_local,
at_timezone(t.more_news_utc, t.timezone) more_news_local,
at_timezone(v.created_utc, t.timezone) created_local
from visits_u_clean_url v
-- this joins the timestamps
LEFT JOIN people_morenonews_timestamps t ON v.person_id = t.person_id
-- this joins the news visits
LEFT JOIN visits_news_all_pl n_pl ON v.id = n_pl.id 
LEFT JOIN visits_news_all_us n_us ON v.id = n_us.id 
where t.person_id is not null) v
-- this joins the ideo scores per domain
LEFT JOIN domains_news_pl d_pl ON v.score_domain = d_pl.domain
LEFT JOIN domains_news_us d_us ON v.score_domain = d_us.score_domain
-- alternative ideo values
LEFT JOIN domains_news_pl_morenonews d_pl_alt ON v.score_domain = d_pl_alt.domain
LEFT JOIN domains_news_us_morenonews d_us_alt ON v.score_domain = d_us_alt.score_domain))
where (timespan in ('month_before_w2', 'during_more_news') and country = 'PL') or
(timespan in ('month_before_w3', 'during_no_news') and country = 'US')
group by country, score_domain, ideo
order by country, count desc